In [1]:
import numpy as np
import sklearn
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib 


def is_covariance(x):
    # checks that x is a valid covariance matrix (symmetric and positive definite)
    symmetric = np.allclose(x, x.T)  # allclose tolerates floating-point round-off
    pos_def = np.all(np.linalg.eigvals(x) > 0)
    return symmetric and pos_def



def plot_data(x,title,eigen=False):
    # plot 3d data
    # x must be of size (n,3)
    # if eigen=True, also plot eigenvectors of cov(x), with the corresponding eigenvalue
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(x[:,0],x[:,1],zs=x[:,2],alpha=0.1)
    limit_max=np.max(x)+1
    limit_min=np.min(x)-1
    ax.set_xlabel("x")
    ax.set_ylabel("y")
    ax.set_zlabel("z")
    ax.set_xlim(limit_min, limit_max)
    ax.set_ylim(limit_min, limit_max)
    ax.set_zlim(limit_min, limit_max)
    ax.set_title(title)
    if eigen:
        n,features=x.shape
        mu=np.mean(x,axis=0)
        pca = PCA(n_components=features)
        pca.fit(x)
        eigenvectors=pca.components_ # each row is an eigenvector
        eigenvalues=pca.explained_variance_ # eigenvalues
        
        scale=np.abs(x).max()
        #scaled_eigenvectors=eigenvectors/(eigenvalues+0.2)*scale
        scaled_eigenvectors=eigenvectors*scale/4
        ax.scatter([mu[0]],[mu[1]],zs=[mu[2]],color="green",s=150)
        endpoints=scaled_eigenvectors+mu
        for i in range(features):
            ax.plot([mu[0], endpoints[i,0]], [mu[1],endpoints[i,1]], [mu[2], endpoints[i,2]], color='red', alpha=0.8, lw=2)
            ax.text(endpoints[i,0], endpoints[i,1], endpoints[i,2], r'$\lambda_{%d}$=%0.2f' % (i, eigenvalues[i]), size=8, zorder=1)


Using matplotlib backend: TkAgg
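
A quick, minimal sketch of how is_covariance behaves (the 2x2 matrices below are hypothetical examples, not used elsewhere):

print(is_covariance(np.array([[2.0, 0.5], [0.5, 1.0]])))  # symmetric and positive definite -> True
print(is_covariance(np.array([[2.0, 0.5], [0.0, 1.0]])))  # not symmetric -> False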

Generate data to simulate a dataset of samples $x$ in which all three features/columns could be collected. $x$ has size $n$ by $features$.


In [2]:
# mean and covariance of the multivariate normal distribution used to generate samples
mu=np.array([5,0,-2])
σ=np.array([[ 9,  1, -1],
            [ 1,  3, -2],
            [-1, -2,  2]])
if not is_covariance(σ):
    print("Warning: σ is not a valid covariance matrix (not symmetric or positive definite)")

n=1000 # number of samples
x=np.random.multivariate_normal(mu,σ,size=(n,))

#plot generated data
plt.close("all")
plot_data(x,"data in original space")
plt.show()
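
As a sanity check (a minimal sketch, not a required step), the sample mean and covariance of the generated data should be close to mu and σ:

print("sample mean:", np.round(x.mean(axis=0), 2))
print("sample covariance:\n", np.round(np.cov(x, rowvar=False), 2))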

Generate another dataset $y$ with the same distribution as $x$ (this is a very strong assumption!).


In [3]:
y=np.random.multivariate_normal(mu,σ,size=(n,))
plot_data(y,"y in original space")
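
Since the reconstruction below relies on $x$ and $y$ sharing a distribution, a small sketch comparing their sample covariance matrices makes that assumption concrete (with n=1000 the entries should differ only slightly):

cov_x = np.cov(x, rowvar=False)
cov_y = np.cov(y, rowvar=False)
print("max |cov(x) - cov(y)|:", np.abs(cov_x - cov_y).max())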

Let's simulate the fact that for $y$ we cannot measure all of the features. In this case, we create y_missing, which keeps only the first two columns.


In [4]:
y_missing=y[:,0:2]
plt.figure()
plt.scatter(y_missing[:,0],y_missing[:,1])
plt.title("y_missing in original space (2d)")


Out[4]:
Text(0.5,1,'y_missing in original space (2d)')

Now, let's assume that we can recover the last feature of $y$ using a linear regression model trained on $x$.


In [5]:
from sklearn import linear_model
model = linear_model.LinearRegression()
model.fit(x[:,0:2],x[:,2])

y3=model.predict(y_missing)
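
To see what the model has learned, we can inspect the fitted coefficients and the $R^2$ score on the training data $x$; a minimal sketch using the standard scikit-learn attributes:

print("coefficients:", model.coef_)      # linear weights mapping (feature 0, feature 1) to feature 2
print("intercept:", model.intercept_)
print("R^2 on x:", model.score(x[:,0:2], x[:,2]))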

We can now create a reconstructed version of $y$, with the two original columns (y_missing) and the predicted third column (y3).


In [6]:
y3=y3.reshape((-1,1))
print(y_missing.shape,y3.shape)
y_reconstructed=np.hstack([y_missing,y3])
plot_data(y_reconstructed, "y_reconstructed")


(1000, 2) (1000, 1)

In [7]:
mean_reconstruction_error=((y_reconstructed-y)**2).sum()/n
print(mean_reconstruction_error)


0.642963027543
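
Only the predicted third column differs from $y$, so this value is effectively the mean squared error of the recovered feature. To put it in context, a simple baseline (a sketch, using a hypothetical mean-fill strategy) is to fill the missing column with the mean of x[:,2] and compute the same error; the regression should come out well below it:

y_baseline = np.hstack([y_missing, np.full((n, 1), x[:, 2].mean())])
baseline_error = ((y_baseline - y) ** 2).sum() / n
print(baseline_error)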